# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
df = pd.read_csv('Clean_Dataset.csv')
df.head()
| Unnamed: 0 | airline | flight | source_city | departure_time | stops | arrival_time | destination_city | class | duration | days_left | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | SpiceJet | SG-8709 | Delhi | Evening | zero | Night | Mumbai | Economy | 2.17 | 1 | 5953 |
| 1 | 1 | SpiceJet | SG-8157 | Delhi | Early_Morning | zero | Morning | Mumbai | Economy | 2.33 | 1 | 5953 |
| 2 | 2 | AirAsia | I5-764 | Delhi | Early_Morning | zero | Early_Morning | Mumbai | Economy | 2.17 | 1 | 5956 |
| 3 | 3 | Vistara | UK-995 | Delhi | Morning | zero | Afternoon | Mumbai | Economy | 2.25 | 1 | 5955 |
| 4 | 4 | Vistara | UK-963 | Delhi | Morning | zero | Morning | Mumbai | Economy | 2.33 | 1 | 5955 |
df1 = pd.read_csv('business.csv')
df2 = pd.read_csv('economy.csv')
df1.head()
| date | airline | ch_code | num_code | dep_time | from | time_taken | stop | arr_time | to | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11-02-2022 | Air India | AI | 868 | 18:00 | Delhi | 02h 00m | non-stop | 20:00 | Mumbai | 25,612 |
| 1 | 11-02-2022 | Air India | AI | 624 | 19:00 | Delhi | 02h 15m | non-stop | 21:15 | Mumbai | 25,612 |
| 2 | 11-02-2022 | Air India | AI | 531 | 20:00 | Delhi | 24h 45m | 1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t... | 20:45 | Mumbai | 42,220 |
| 3 | 11-02-2022 | Air India | AI | 839 | 21:25 | Delhi | 26h 30m | 1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t... | 23:55 | Mumbai | 44,450 |
| 4 | 11-02-2022 | Air India | AI | 544 | 17:15 | Delhi | 06h 40m | 1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t... | 23:55 | Mumbai | 46,690 |
df2.head()
| date | airline | ch_code | num_code | dep_time | from | time_taken | stop | arr_time | to | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11-02-2022 | SpiceJet | SG | 8709 | 18:55 | Delhi | 02h 10m | non-stop | 21:05 | Mumbai | 5,953 |
| 1 | 11-02-2022 | SpiceJet | SG | 8157 | 06:20 | Delhi | 02h 20m | non-stop | 08:40 | Mumbai | 5,953 |
| 2 | 11-02-2022 | AirAsia | I5 | 764 | 04:25 | Delhi | 02h 10m | non-stop | 06:35 | Mumbai | 5,956 |
| 3 | 11-02-2022 | Vistara | UK | 995 | 10:20 | Delhi | 02h 15m | non-stop | 12:35 | Mumbai | 5,955 |
| 4 | 11-02-2022 | Vistara | UK | 963 | 08:50 | Delhi | 02h 20m | non-stop | 11:10 | Mumbai | 5,955 |
df1['price'] = df1['price'].str.replace(',', '')
df1['price'] = df1['price'].astype(int)
df2['price'] = df2['price'].str.replace(',', '')
df2['price'] = df2['price'].astype(int)
df1['stop'] = df1['stop'].str.replace('\n', '')
df1['stop'] = df1['stop'].str.replace('\t', '')
df1['stop'].str.strip()
df2['stop'] = df2['stop'].str.replace('\n', '')
df2['stop'] = df2['stop'].str.replace('\t', '')
df2['stop'].str.strip()
0 non-stop
1 non-stop
2 non-stop
3 non-stop
4 non-stop
...
206769 1-stop
206770 1-stop
206771 1-stop
206772 1-stop
206773 1-stop
Name: stop, Length: 206774, dtype: object
df1['time_taken'] = df1['time_taken'].apply(lambda x: x.split("h")[0])
df1['time_taken'] = df1['time_taken'].astype(float)
df2['time_taken'] = df2['time_taken'].apply(lambda x: x.split("h")[0])
df2['time_taken'] = df2['time_taken'].astype(float)
df1['time_taken'] = df1['time_taken'].astype(int)
df2['time_taken'] = df2['time_taken'].astype(int)
df1['date'] = pd.to_datetime(df1['date'], format='%d-%m-%Y')
df1.dtypes
date datetime64[ns] airline object ch_code object num_code int64 dep_time object from object time_taken int64 stop object arr_time object to object price int64 dtype: object
df2['date'] = pd.to_datetime(df2['date'], format='%d-%m-%Y')
df2.dtypes
date datetime64[ns] airline object ch_code object num_code int64 dep_time object from object time_taken int64 stop object arr_time object to object price int64 dtype: object
print("Cleand Dataset Shape:", df.shape)
print("Business Dataset Shape:", df1.shape)
print("Economy Dataset Shape:", df2.shape)
Cleand Dataset Shape: (300153, 12) Business Dataset Shape: (93487, 11) Economy Dataset Shape: (206774, 11)
df1['class'] = 'Business'
df2['class'] = 'Economy'
display(df2.head())
df.head()
| date | airline | ch_code | num_code | dep_time | from | time_taken | stop | arr_time | to | price | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-02-11 | SpiceJet | SG | 8709 | 18:55 | Delhi | 2 | non-stop | 21:05 | Mumbai | 5953 | Economy |
| 1 | 2022-02-11 | SpiceJet | SG | 8157 | 06:20 | Delhi | 2 | non-stop | 08:40 | Mumbai | 5953 | Economy |
| 2 | 2022-02-11 | AirAsia | I5 | 764 | 04:25 | Delhi | 2 | non-stop | 06:35 | Mumbai | 5956 | Economy |
| 3 | 2022-02-11 | Vistara | UK | 995 | 10:20 | Delhi | 2 | non-stop | 12:35 | Mumbai | 5955 | Economy |
| 4 | 2022-02-11 | Vistara | UK | 963 | 08:50 | Delhi | 2 | non-stop | 11:10 | Mumbai | 5955 | Economy |
| Unnamed: 0 | airline | flight | source_city | departure_time | stops | arrival_time | destination_city | class | duration | days_left | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | SpiceJet | SG-8709 | Delhi | Evening | zero | Night | Mumbai | Economy | 2.17 | 1 | 5953 |
| 1 | 1 | SpiceJet | SG-8157 | Delhi | Early_Morning | zero | Morning | Mumbai | Economy | 2.33 | 1 | 5953 |
| 2 | 2 | AirAsia | I5-764 | Delhi | Early_Morning | zero | Early_Morning | Mumbai | Economy | 2.17 | 1 | 5956 |
| 3 | 3 | Vistara | UK-995 | Delhi | Morning | zero | Afternoon | Mumbai | Economy | 2.25 | 1 | 5955 |
| 4 | 4 | Vistara | UK-963 | Delhi | Morning | zero | Morning | Mumbai | Economy | 2.33 | 1 | 5955 |
new_df=pd.concat([df1, df2])
new_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 300261 entries, 0 to 206773 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 300261 non-null datetime64[ns] 1 airline 300261 non-null object 2 ch_code 300261 non-null object 3 num_code 300261 non-null int64 4 dep_time 300261 non-null object 5 from 300261 non-null object 6 time_taken 300261 non-null int64 7 stop 300261 non-null object 8 arr_time 300261 non-null object 9 to 300261 non-null object 10 price 300261 non-null int64 11 class 300261 non-null object dtypes: datetime64[ns](1), int64(3), object(8) memory usage: 29.8+ MB
new_df.date.value_counts()
2022-03-07 6633 2022-02-28 6603 2022-03-21 6594 2022-03-14 6586 2022-03-08 6577 2022-03-06 6543 2022-03-01 6541 2022-03-15 6536 2022-03-13 6536 2022-03-22 6535 2022-03-10 6527 2022-03-23 6526 2022-03-20 6513 2022-03-12 6505 2022-03-24 6503 2022-03-02 6502 2022-03-04 6495 2022-03-18 6491 2022-03-03 6483 2022-03-19 6480 2022-03-25 6473 2022-03-26 6440 2022-02-27 6420 2022-02-21 6418 2022-03-16 6412 2022-03-05 6405 2022-02-23 6404 2022-03-11 6398 2022-02-22 6385 2022-03-09 6361 2022-02-24 6353 2022-02-25 6342 2022-03-27 6314 2022-03-17 6296 2022-02-26 6279 2022-03-28 6160 2022-03-31 6157 2022-03-30 6078 2022-03-29 6072 2022-02-20 5823 2022-02-18 5768 2022-02-16 5740 2022-02-17 5703 2022-02-19 5669 2022-02-15 5395 2022-02-14 5079 2022-02-13 4250 2022-02-12 4031 2022-02-11 1927 Name: date, dtype: int64
new_df.head()
| date | airline | ch_code | num_code | dep_time | from | time_taken | stop | arr_time | to | price | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-02-11 | Air India | AI | 868 | 18:00 | Delhi | 2 | non-stop | 20:00 | Mumbai | 25612 | Business |
| 1 | 2022-02-11 | Air India | AI | 624 | 19:00 | Delhi | 2 | non-stop | 21:15 | Mumbai | 25612 | Business |
| 2 | 2022-02-11 | Air India | AI | 531 | 20:00 | Delhi | 24 | 1-stop | 20:45 | Mumbai | 42220 | Business |
| 3 | 2022-02-11 | Air India | AI | 839 | 21:25 | Delhi | 26 | 1-stop | 23:55 | Mumbai | 44450 | Business |
| 4 | 2022-02-11 | Air India | AI | 544 | 17:15 | Delhi | 6 | 1-stop | 23:55 | Mumbai | 46690 | Business |
new_df.ch_code.value_counts()
UK 127859 AI 80894 6E 43120 G8 23177 I5 16098 SG 9011 S5 61 2T 41 Name: ch_code, dtype: int64
new_df['num_code'].value_counts()
808 3313
706 3235
772 2860
774 2808
720 2650
...
1058 1
6613 1
405 1
9923 1
8913 1
Name: num_code, Length: 1255, dtype: int64
new_df.stop.value_counts()
1-stop 243603 non-stop 36044 2+-stop 13288 1-stopVia IXU 1839 1-stopVia IDR 1398 1-stopVia Patna 674 1-stopVia Indore 381 1-stopVia PAT 354 1-stopVia MYQ 321 1-stopVia Bhubaneswar 301 1-stopVia KLH 284 1-stopVia JGB 193 1-stopVia JRG 175 1-stopVia STV 169 1-stopVia BBI 158 1-stopVia Delhi 153 1-stopVia Hyderabad 143 1-stopVia IXE 120 1-stopVia Ranchi 114 1-stopVia Raipur 102 1-stopVia Chennai 77 1-stopVia Guwahati 55 1-stopVia Mysore 45 1-stopVia Mangalore 39 1-stopVia Mumbai 31 1-stopVia Kolkata 28 1-stopVia VTZ 27 1-stopVia Nagpur 25 1-stopVia RPR 22 1-stopVia NDC 22 1-stopVia GOP 16 1-stopVia Surat 14 1-stopVia Lucknow 13 1-stopVia NAG 9 1-stopVia Vishakhapatnam 8 1-stopVia Kolhapur 5 1-stopVia IXR 4 1-stopVia GAU 3 1-stopVia GAY 3 1-stopVia HYD 1 Name: stop, dtype: int64
df['flight'].value_counts()
UK-706 3235
UK-772 2741
UK-720 2650
UK-836 2542
UK-822 2468
...
6E-2939 1
SG-9923 1
6E-865 1
SG-8106 1
SG-8480 1
Name: flight, Length: 1561, dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300153 entries, 0 to 300152 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 300153 non-null int64 1 airline 300153 non-null object 2 flight 300153 non-null object 3 source_city 300153 non-null object 4 departure_time 300153 non-null object 5 stops 300153 non-null object 6 arrival_time 300153 non-null object 7 destination_city 300153 non-null object 8 class 300153 non-null object 9 duration 300153 non-null float64 10 days_left 300153 non-null int64 11 price 300153 non-null int64 dtypes: float64(1), int64(3), object(8) memory usage: 27.5+ MB
df3=df.drop(['Unnamed: 0'],axis=1)
df3.head()
| airline | flight | source_city | departure_time | stops | arrival_time | destination_city | class | duration | days_left | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SpiceJet | SG-8709 | Delhi | Evening | zero | Night | Mumbai | Economy | 2.17 | 1 | 5953 |
| 1 | SpiceJet | SG-8157 | Delhi | Early_Morning | zero | Morning | Mumbai | Economy | 2.33 | 1 | 5953 |
| 2 | AirAsia | I5-764 | Delhi | Early_Morning | zero | Early_Morning | Mumbai | Economy | 2.17 | 1 | 5956 |
| 3 | Vistara | UK-995 | Delhi | Morning | zero | Afternoon | Mumbai | Economy | 2.25 | 1 | 5955 |
| 4 | Vistara | UK-963 | Delhi | Morning | zero | Morning | Mumbai | Economy | 2.33 | 1 | 5955 |
df3['days_left'] = df3['days_left'].astype(int)
df3.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300153 entries, 0 to 300152 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 airline 300153 non-null object 1 flight 300153 non-null object 2 source_city 300153 non-null object 3 departure_time 300153 non-null object 4 stops 300153 non-null object 5 arrival_time 300153 non-null object 6 destination_city 300153 non-null object 7 class 300153 non-null object 8 duration 300153 non-null float64 9 days_left 300153 non-null int64 10 price 300153 non-null int64 dtypes: float64(1), int64(2), object(8) memory usage: 25.2+ MB
df3.duration.value_counts().sort_values()
41.50 1
39.25 1
35.67 1
36.25 1
41.58 1
...
2.83 2323
2.08 2755
2.75 2879
2.25 4036
2.17 4242
Name: duration, Length: 476, dtype: int64
df3.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| duration | 300153.0 | 12.221021 | 7.191997 | 0.83 | 6.83 | 11.25 | 16.17 | 49.83 |
| days_left | 300153.0 | 26.004751 | 13.561004 | 1.00 | 15.00 | 26.00 | 38.00 | 49.00 |
| price | 300153.0 | 20889.660523 | 22697.767366 | 1105.00 | 4783.00 | 7425.00 | 42521.00 | 123071.00 |
df.isnull().sum()
Unnamed: 0 0 airline 0 flight 0 source_city 0 departure_time 0 stops 0 arrival_time 0 destination_city 0 class 0 duration 0 days_left 0 price 0 dtype: int64
sns.heatmap(df1.isnull())
<AxesSubplot:>
df3['source_city'].value_counts()
Delhi 61343 Mumbai 60896 Bangalore 52061 Kolkata 46347 Hyderabad 40806 Chennai 38700 Name: source_city, dtype: int64
df3['destination_city'].value_counts()
Mumbai 59097 Delhi 57360 Bangalore 51068 Kolkata 49534 Hyderabad 42726 Chennai 40368 Name: destination_city, dtype: int64
df3['stops'].unique()
array(['zero', 'one', 'two_or_more'], dtype=object)
new_df.corr().style.background_gradient()
| num_code | time_taken | price | |
|---|---|---|---|
| num_code | 1.000000 | -0.133619 | -0.212865 |
| time_taken | -0.133619 | 1.000000 | 0.205324 |
| price | -0.212865 | 0.205324 | 1.000000 |
plt.figure(figsize=(20, 10))
sns.countplot(x='airline',hue='class' ,data=new_df).set(title='airline count based on class')
[Text(0.5, 1.0, 'airline count based on class')]
# From graph we can see that Jet Vistara have the highest Price.
# Apart from the first Airline almost all are having similar median
# Airline vs Price
sns.catplot(y = "price", x = "airline", data = df3.sort_values("price", ascending = False), kind="boxen", height = 6, aspect = 3)
plt.show()
fig, axes = plt.subplots(1,2, figsize=(20,10))
sns.barplot(x='source_city', y='price', data=df3.sort_values('source_city', axis=0), ax=axes[0])
axes[0].set(title='Price variation based on source city')
sns.barplot(x='destination_city', y='price', data=df3.sort_values('destination_city', axis=0), ax=axes[1])
axes[1].set(title='Price variation based on destination city');
sns.relplot(col="airline", y="price", kind="line",x='stops', data=df3, col_wrap=2)
<seaborn.axisgrid.FacetGrid at 0x7fc83e20c5e0>
Vsitara and Air India are the most expensive airlines, as we can see the one-stop condition is the highest price among the others.
fig, axes = plt.subplots(1,2, figsize=(20,10))
sns.barplot(x='airline', y='price', data=df3.sort_values('airline', axis=0), ax=axes[0])
axes[0].set(title='Price variation based on airline')
sns.countplot(x='airline',hue='stops', data=df3.sort_values('stops', axis=0), ax=axes[1])
axes[1].set(title='Airline count based on stop');
plt.figure(figsize=(20, 10))
sns.countplot(x='source_city',hue='stops',data=df3).set(title='Source city count based on stops')
[Text(0.5, 1.0, 'Source city count based on stops')]
plt.figure(figsize=(20, 10))
sns.countplot(x='destination_city',hue='stops',data=df3).set(title='Destination city count based on stops')
[Text(0.5, 1.0, 'Destination city count based on stops')]
# Source vs Price
sns.catplot(y = "price", x = "source_city", data = df3.sort_values("price", ascending = False), kind="boxen", height = 4, aspect = 3)
plt.show()
plt.figure(figsize=(20, 10))
sns.countplot(x='arrival_time',hue='stops',data=df3).set(title='Arrival time count based on stops')
[Text(0.5, 1.0, 'Arrival time count based on stops')]
plt.figure(figsize=(20, 10))
sns.countplot(x='departure_time',hue='stops',data=df3).set(title='Departure time count based on stops')
[Text(0.5, 1.0, 'Departure time count based on stops')]
Night on the arrival time and Morning on the departure time.plt.figure(figsize=(20, 10))
sns.countplot(x='ch_code', data=new_df).set(title='Airways service count')
[Text(0.5, 1.0, 'Airways service count')]
plt.figure(figsize=(20, 10))
sns.barplot(x='ch_code', y='price' ,data=new_df).set(title='Price variation based on airways service')
[Text(0.5, 1.0, 'Price variation based on airways service')]
UK airways service is the most expensive airways and the most common one.
plt.figure(figsize=(20, 15))
sns.stripplot(x='airline',y='price',hue='days_left',data=df3).set(title='Price variation based on airline respect to days left')
[Text(0.5, 1.0, 'Price variation based on airline respect to days left')]
Question) Does the ticket booking date affect the price?
The price changes depending on the date of booking the ticket, especially if it is booked a day or two before the flight date, the price will become higher. Also there is a drop in prices one day before the departure.
sns.relplot(col="departure_time", y="price", kind="line",x='arrival_time', data=df3, col_wrap=3)
<seaborn.axisgrid.FacetGrid at 0x7fc861e43400>
Question) Do the times of the day affect the price?
The price changing based on the departure time and arrival time we can see some patterns such as when departure time is evening and Arrival time is night this is lowering the price.
sns.relplot(col="source_city", y="price", kind="line",x='destination_city', data=df3, col_wrap=3)
<seaborn.axisgrid.FacetGrid at 0x7fc873d2beb0>
Question) Does the destination and source city affect the price?
Yes, and we can see from the above some information such Delhi price will be high if your destination is bangalore.
plt.figure(figsize=(20, 10))
sns.barplot(x='airline', y='price',hue='class' ,data=df3).set(title='Price variation based on airline respect to class')
[Text(0.5, 1.0, 'Price variation based on airline respect to class')]
plt.figure(figsize=(20, 10))
sns.barplot(x='airline',y='price',hue='stops',data=df3).set(title='Price variation based on airline respect to stops')
[Text(0.5, 1.0, 'Price variation based on airline respect to stops')]
Question) Does the stops affect the price?
Yes, and we can see the price is much lower in flight that has zero stops, and for both
VistaraandAir Indiathe one stop is much higher price than other airlines.
fig, axes = plt.subplots(1,2, figsize=(20,10))
sns.barplot(x='airline', y='price',hue='class' ,data=df1, ax=axes[0])
axes[0].set(title='Business class price variation based on airline')
sns.barplot(x='airline', y='price',hue='class' ,data=df2, ax=axes[1])
axes[1].set(title='Economy class price variation based on airline')
[Text(0.5, 1.0, 'Economy class price variation based on airline')]
Vistara and the second is Air India in both class.
trace1 = go.Scatter(
x = df1.date,
y = df1.price,
mode = "lines",
name = "citations",
marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
text= df1.to)
data = [trace1]
layout = dict(title = 'Price variation with dates destnation cities on business class',
xaxis= dict(title= 'Price Variation',ticklen= 5,zeroline= False)
)
fig = dict(data = data, layout = layout)
iplot(fig)
trace1 = go.Scatter(
x = df2.date,
y = df2.price,
mode = "lines",
name = "citations",
marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
text= df2.to)
data = [trace1]
layout = dict(title = 'Price variation with dates destnation cities on economy class',
xaxis= dict(title= 'Price Variation',ticklen= 5,zeroline= False)
)
fig = dict(data = data, layout = layout)
iplot(fig)
trace1 = go.Scatter(
x = df1.date,
y = df1.price,
mode = "lines",
name = "citations",
marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
text= df1['from'])
data = [trace1]
layout = dict(title = 'Price variation with dates departure cities on business class',
xaxis= dict(title= 'Price Variation',ticklen= 5,zeroline= False)
)
fig = dict(data = data, layout = layout)
iplot(fig)
trace1 = go.Scatter(
x = df2.date,
y = df2.price,
mode = "lines",
name = "citations",
marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
text= df2['from'])
data = [trace1]
layout = dict(title = 'Price variation with dates departure cities on economy class',
xaxis= dict(title= 'Price Variation',ticklen= 5,zeroline= False)
)
fig = dict(data = data, layout = layout)
iplot(fig)
from matplotlib.dates import DateFormatter
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='date', y='price',hue='to' ,data=df1, ci=None).set(title="date and time respect to destantion city in business class")
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
Kolkata is the highest destination city price among the others in business class.
from matplotlib.dates import DateFormatter
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='date', y='price',hue='from' ,data=df1, ci=None).set(title="date and time respect to depature city in business class")
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
Kolkata is the highest departure city price among the others in business class.
from matplotlib.dates import DateFormatter
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='date', y='price',hue='to' ,data=df2, ci=None).set(title="date and time respect to destination city in economy class")
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
Kolkata is the highest departure city price among the others in economy class, and there are maybe some seasons on dates before 02-22 because the prices are much higher on this dates.
from matplotlib.dates import DateFormatter
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(x='date', y='price',hue='from' ,data=df2, ci=None).set(title="date and time respect to departure city in business class")
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
Kolkata is the highest destination city price among the others in economy class, and there are maybe some seasons on dates before 02-22 because the prices are much higher on this dates.
dataframe = new_df.set_index("date")
dataframe
| airline | ch_code | num_code | dep_time | from | time_taken | stop | arr_time | to | price | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||
| 2022-02-11 | Air India | AI | 868 | 18:00 | Delhi | 2 | non-stop | 20:00 | Mumbai | 25612 | Business |
| 2022-02-11 | Air India | AI | 624 | 19:00 | Delhi | 2 | non-stop | 21:15 | Mumbai | 25612 | Business |
| 2022-02-11 | Air India | AI | 531 | 20:00 | Delhi | 24 | 1-stop | 20:45 | Mumbai | 42220 | Business |
| 2022-02-11 | Air India | AI | 839 | 21:25 | Delhi | 26 | 1-stop | 23:55 | Mumbai | 44450 | Business |
| 2022-02-11 | Air India | AI | 544 | 17:15 | Delhi | 6 | 1-stop | 23:55 | Mumbai | 46690 | Business |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2022-03-31 | Vistara | UK | 832 | 07:05 | Chennai | 13 | 1-stop | 20:55 | Hyderabad | 7697 | Economy |
| 2022-03-31 | Vistara | UK | 832 | 07:05 | Chennai | 13 | 1-stop | 20:55 | Hyderabad | 7709 | Economy |
| 2022-03-31 | Vistara | UK | 826 | 12:30 | Chennai | 20 | 1-stop | 09:05 | Hyderabad | 8640 | Economy |
| 2022-03-31 | Vistara | UK | 822 | 09:45 | Chennai | 23 | 1-stop | 09:05 | Hyderabad | 8640 | Economy |
| 2022-03-31 | Vistara | UK | 824 | 20:30 | Chennai | 24 | 1-stop | 20:55 | Hyderabad | 8640 | Economy |
300261 rows × 11 columns
Chennai and Kolkata.
df3['price_per_minute'] = df3['price']/df3['duration']*60
plt.figure(figsize=(20,8))
plt.subplot(1,2,1)
plt.title('Price Distribution Plot')
sns.kdeplot(df3.price)
plt.subplot(1,2,2)
plt.title('Price per minute Distribution Plot')
sns.kdeplot(df3.price_per_minute)
plt.show()
df3.columns
Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
'arrival_time', 'destination_city', 'class', 'duration', 'days_left',
'price', 'price_per_minute'],
dtype='object')
df3.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 300153 entries, 0 to 300152 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 airline 300153 non-null object 1 flight 300153 non-null object 2 source_city 300153 non-null object 3 departure_time 300153 non-null object 4 stops 300153 non-null object 5 arrival_time 300153 non-null object 6 destination_city 300153 non-null object 7 class 300153 non-null object 8 duration 300153 non-null float64 9 days_left 300153 non-null int64 10 price 300153 non-null int64 11 price_per_minute 300153 non-null float64 dtypes: float64(2), int64(2), object(8) memory usage: 27.5+ MB
df3['departure_time'].value_counts()
Morning 71146 Early_Morning 66790 Evening 65102 Night 48015 Afternoon 47794 Late_Night 1306 Name: departure_time, dtype: int64
df3['arrival_time'].value_counts()
Night 91538 Evening 78323 Morning 62735 Afternoon 38139 Early_Morning 15417 Late_Night 14001 Name: arrival_time, dtype: int64
df3['source_city'].value_counts()
Delhi 61343 Mumbai 60896 Bangalore 52061 Kolkata 46347 Hyderabad 40806 Chennai 38700 Name: source_city, dtype: int64
df3['stops'].value_counts()
one 250863 zero 36004 two_or_more 13286 Name: stops, dtype: int64
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder,MinMaxScaler
# As airline is Nominal Categorical data we will perform OneHotEncoding
airline = df3[["airline"]]
airline = pd.get_dummies(airline, drop_first= True)
airline.head()
| airline_Air_India | airline_GO_FIRST | airline_Indigo | airline_SpiceJet | airline_Vistara | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0 | 0 | 0 | 0 | 1 |
# As source city is Nominal Categorical data we will perform OneHotEncoding
source = df3[["source_city"]]
source = pd.get_dummies(source, drop_first= True)
source.head()
| source_city_Chennai | source_city_Delhi | source_city_Hyderabad | source_city_Kolkata | source_city_Mumbai | |
|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 0 | 1 | 0 | 0 | 0 |
| 2 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0 | 1 | 0 | 0 | 0 |
# As departure time is Nominal Categorical data we will perform OneHotEncoding
departure = df3[["departure_time"]]
departure = pd.get_dummies(departure, drop_first= True)
departure.head()
| departure_time_Early_Morning | departure_time_Evening | departure_time_Late_Night | departure_time_Morning | departure_time_Night | |
|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 1 | 0 | 0 | 0 | 0 |
| 2 | 1 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0 | 0 | 0 | 1 | 0 |
# As departure time is Nominal Categorical data we will perform OneHotEncoding
arrival = df3[["arrival_time"]]
arrival = pd.get_dummies(arrival, drop_first= True)
arrival.head()
| arrival_time_Early_Morning | arrival_time_Evening | arrival_time_Late_Night | arrival_time_Morning | arrival_time_Night | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 1 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 1 | 0 |
# As destenation city is Nominal Categorical data we will perform OneHotEncoding
destination = df3[["destination_city"]]
destination = pd.get_dummies(destination, drop_first= True)
destination.head()
| destination_city_Chennai | destination_city_Delhi | destination_city_Hyderabad | destination_city_Kolkata | destination_city_Mumbai | |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 0 | 0 | 0 | 1 |
| 3 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0 | 0 | 0 | 0 | 1 |
df3.replace({"zero": 0, "one": 1, "two_or_more": 2}, inplace = True)
df3.replace({"Economy": 0, "Business": 1}, inplace = True)
df4 = pd.concat([df3, airline, source, departure, arrival, destination], axis = 1)
df4.head()
| airline | flight | source_city | departure_time | stops | arrival_time | destination_city | class | duration | days_left | ... | arrival_time_Early_Morning | arrival_time_Evening | arrival_time_Late_Night | arrival_time_Morning | arrival_time_Night | destination_city_Chennai | destination_city_Delhi | destination_city_Hyderabad | destination_city_Kolkata | destination_city_Mumbai | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SpiceJet | SG-8709 | Delhi | Evening | 0 | Night | Mumbai | 0 | 2.17 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 1 | SpiceJet | SG-8157 | Delhi | Early_Morning | 0 | Morning | Mumbai | 0 | 2.33 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | AirAsia | I5-764 | Delhi | Early_Morning | 0 | Early_Morning | Mumbai | 0 | 2.17 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | Vistara | UK-995 | Delhi | Morning | 0 | Afternoon | Mumbai | 0 | 2.25 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | Vistara | UK-963 | Delhi | Morning | 0 | Morning | Mumbai | 0 | 2.33 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 37 columns
df4.drop(["airline", "flight", 'source_city', "departure_time", 'arrival_time', 'destination_city', 'price_per_minute'], axis = 1, inplace = True)
df4.head()
| stops | class | duration | days_left | price | airline_Air_India | airline_GO_FIRST | airline_Indigo | airline_SpiceJet | airline_Vistara | ... | arrival_time_Early_Morning | arrival_time_Evening | arrival_time_Late_Night | arrival_time_Morning | arrival_time_Night | destination_city_Chennai | destination_city_Delhi | destination_city_Hyderabad | destination_city_Kolkata | destination_city_Mumbai | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 2.17 | 1 | 5953 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 2.33 | 1 | 5953 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 0 | 2.17 | 1 | 5956 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 0 | 0 | 2.25 | 1 | 5955 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0 | 0 | 2.33 | 1 | 5955 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 30 columns
X = df4.drop(['price'], axis=1)
y = df4['price']
X
| stops | class | duration | days_left | airline_Air_India | airline_GO_FIRST | airline_Indigo | airline_SpiceJet | airline_Vistara | source_city_Chennai | ... | arrival_time_Early_Morning | arrival_time_Evening | arrival_time_Late_Night | arrival_time_Morning | arrival_time_Night | destination_city_Chennai | destination_city_Delhi | destination_city_Hyderabad | destination_city_Kolkata | destination_city_Mumbai | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 2.17 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 2.33 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 0 | 2.17 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 0 | 0 | 2.25 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0 | 0 | 2.33 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 300148 | 1 | 1 | 10.08 | 49 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 300149 | 1 | 1 | 10.42 | 49 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 300150 | 1 | 1 | 13.83 | 49 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 300151 | 1 | 1 | 10.00 | 49 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 300152 | 1 | 1 | 10.08 | 49 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
300153 rows × 29 columns
# Important feature using ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesRegressor
selection = ExtraTreesRegressor()
selection.fit(X, y)
ExtraTreesRegressor()
print(selection.feature_importances_)
[3.37760148e-02 8.80372309e-01 2.30366296e-02 1.87492358e-02 5.43715482e-03 1.83641420e-04 1.67105875e-04 1.37560858e-04 5.51644254e-03 1.04014608e-03 4.21120594e-03 1.49153286e-03 2.01313766e-03 2.36546597e-03 1.09602817e-03 1.47663936e-03 5.34244548e-05 1.34169382e-03 9.94190954e-04 9.38228052e-04 1.49662651e-03 2.22150374e-04 1.06744649e-03 1.75736200e-03 1.04105638e-03 4.10354975e-03 1.92193639e-03 1.97151964e-03 2.02056403e-03]
#plot graph of feature importances for better visualization
plt.figure(figsize = (12,8))
feat_importances = pd.Series(selection.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()
#cat_columns = ['airline','source_city','departure_time','stops','arrival_time','destination_city','class']
#num_columns = ['duration','days_left']
#encoder = OrdinalEncoder().fit_transform(df3[cat_columns])
#encoder = pd.DataFrame(encoder,columns = cat_columns)
#X = pd.concat([encoder,df3[num_columns]],axis=1)
#y = df3['price']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state= 42)
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(y_pred)
[ 3151.37494602 54895.59810797 9856.93848654 ... 4522.1445811 -397.39426783 57811.06252282]
print("R2 Score: ",r2_score(y_test,y_pred))
print("Mean Squared Error: ",mean_squared_error(y_test, y_pred))
print('Mean Absolute Error', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
R2 Score: 0.9099031138535913 Mean Squared Error: 46443347.71222658 Mean Absolute Error 4500.712501541862 Root Mean Squared Error: 6814.935635222579
from sklearn.ensemble import GradientBoostingRegressor
gbre = GradientBoostingRegressor()
gbre.fit(X_train, y_train)
y_pred = gbre.predict(X_test)
print(y_pred)
[ 5099.25271303 62513.40546999 8045.78944625 ... 5314.91661395 2687.44731556 65815.10665754]
print("R2 Score: ",r2_score(y_test,y_pred))
print("Mean Squared Error: ",mean_squared_error(y_test, y_pred))
print('Mean Absolute Error', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
R2 Score: 0.9514799890396428 Mean Squared Error: 25011205.563431542 Mean Absolute Error 2972.056919745379 Root Mean Squared Error: 5001.120430806635
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print(y_pred)
[ 7349.2 70802.84 6195. ... 6496.84 3757.18 69950.71]
print("R2 Score: ",r2_score(y_test,y_pred))
print("Mean Squared Error: ",mean_squared_error(y_test, y_pred))
print('Mean Absolute Error', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
R2 Score: 0.9848578081522366 Mean Squared Error: 7805531.48050116 Mean Absolute Error 1083.9322441045008 Root Mean Squared Error: 2793.8381271113685
plt.figure(figsize = (8,8))
sns.kdeplot(y_test-y_pred)
plt.show()
plt.figure(figsize = (8,8))
plt.scatter(y_test, y_pred, alpha = 0.5)
plt.xlabel("y_test")
plt.ylabel("y_pred")
plt.show()
Random Forest Regressor is the highest model based on the metrics.
import pickle
# open a file, where you ant to store the data
file = open('flight_rf.pkl', 'wb')
# dump information to that file
pickle.dump(regr, file)
model = open('flight_rf.pkl','rb')
forest = pickle.load(model)
y_prediction = forest.predict(X_test)
r2_score(y_test, y_prediction)
0.9848578081522366